
/* === STEP 1 ================================================================*/
/* Read the household/individual extract and keep the variables used below */

import sas using "${newdata}househ_wp2", clear
describe

/* Shorter working names for the person id and the year variable */
rename (w17_1455_lnr aargang) (indid year)

/* Variables retained:
	indid                    person-id
	kvinne                   1 if woman
	antpers_i_regstat_famnr  number of individuals per hh (adults + children)
	year                     year
	familienr                family-id
	age                      age
	grkrets                  household location
	wies                     income net of taxes
	grputd                   aggregated education
	employed                 1 == labor income above 1G
	wealth                   wealth, NOK
	hushtyp                  household type
	secondhome               owns a second home
	retired                  1 == receiving age pension
	nr_adults                number of adults in the household, not counting
	                         children above 18
	children                 1 == lives with children below 18
	adult                    1 == above 18 and not living with parents (?)
*/
keep indid kvinne antpers_i_regstat_famnr year familienr age grkrets ///
	wies grputd employed wealth hushtyp secondhome retired ///
	nr_adults children adult

/* === STEP 2 =============================================================== */
/* Attach the geographical area (and occupation code) of each person's
   workplace. The employment file is reduced to one record per person and
   year -- the work relationship with the highest wage -- and then merged
   onto the person data loaded in step 1. */
preserve
	/* Path quoted, as in the other import statements, so that a data
	   directory containing spaces does not break the command */
	import sas using "${newdata}utvalg_atmlto", clear
	describe
	drop if grk_bed == "" /* We don't need work relationships w/o a geographical location */
	rename w17_1455_lnr indid

	/* Negate the wage so that an ascending sort puts the highest wage first.
	   grk_bed and yrk_kode act as tie-breakers: without them the record kept
	   among equally paid jobs would be picked at random by the sort and
	   could differ from run to run. */
	replace wlonn = -wlonn
	bys indid year (wlonn grk_bed yrk_kode): keep if _n == 1

	/* For now, only keeping firm's geographical area */
	keep indid 		/// individual id
		 year 		/// year
		 yrk_kode 	/// type of occupation
		 grk_bed 	// geographical area of workplace
	destring year, replace
	rename year yearformerge
	tempfile grk_bed
	save `grk_bed'
restore

destring year, replace
gen yearformerge = year
/* Person-years after 2014 are matched to the 2014 workplace record.
   Stata treats missing as larger than any number, so missing years are
   excluded explicitly instead of being silently mapped to 2014. */
replace yearformerge = 2014 if year > 2014 & !missing(year)
merge m:1 indid yearformerge using `grk_bed', keep(match master)
drop _merge

/* === STEP 3 =============================================================== */
/* Attach time, distance, toll and ptl for each pair of geographical areas */
preserve
	use "${additionaldata}distances", clear
	/* One row per area pair and year; takst_r is the only year-specific column */
	reshape long takst_r, i(grk1 grk2) j(year)
	rename (takst_r weight s kfelt) (toll time dist ptl)
	keep grk1 grk2 year toll time dist ptl
	tempfile grk1grk2
	save `grk1grk2'
restore

/* Numeric area codes: grk1 = residence, grk2 = workplace */
destring grkrets, gen(grk1)
destring grk_bed, gen(grk2)

/* Two look-ups against the same table. Suffix 2 is the trip to work
   (residence -> workplace); suffix 1 is the trip from work, obtained by
   swapping the two area codes before merging again. The swap after the
   last pass is harmless because both codes are dropped right after. */
foreach leg in 2 1 {
	merge m:1 grk1 grk2 year using `grk1grk2', keep(match master)
	drop _merge
	rename (toll time dist ptl) (toll`leg' time`leg' dist`leg' ptl`leg')
	rename (grk1 grk2) (grk2 grk1)
}
drop grk1 grk2

/* Average of the two directions; when only one direction is available,
   use that one */
foreach measure in time dist ptl toll {
	gen `measure' = (`measure'1 + `measure'2) / 2
	foreach leg in 1 2 {
		replace `measure' = `measure'`leg' if `measure' == .
	}
	drop `measure'1 `measure'2
}

/* Measure dist and ptl in km */
foreach measure in dist ptl {
	replace `measure' = `measure' / 1000
}
compress
drop yearformerge

/* === STEP 4 ================================================================*/
/* Reshape to wide, to create household-level data.

   Up to two "main" household members are defined first:
	Member 1: male, if available; with work, if available; oldest
	Member 2: female, if available; otherwise ranked the same way as member 1
*/

/* Number of observations per family and year, tabulated for households
   that have no adults */
bysort familienr year: egen antall_i_data = count(kvinne)
tabulate antall_i_data if nr_adults == 0

/* Some households have no adults - in this case every member is defined as adult */
replace adult = 1 if nr_adults == 0

/* === NOTE: Important command. This removes all children above 18 living in
   the same household. Comment it out if you want the children kept. ======== */
keep if adult != 0
/* Rank persons within family, year and gender: employed first, then oldest.
   The ordering is stated in the by-prefix itself instead of relying on a
   preceding gsort, because bysort only guarantees the order of the variables
   it is given. Negated copies give a descending order with missing values
   last; indid is the final tie-breaker so the ranking is reproducible. */
tempvar neg_employed neg_age
gen double `neg_employed' = -employed
gen double `neg_age' = -age
bys familienr year kvinne (`neg_employed' `neg_age' indid): gen first_ind = _n
drop `neg_employed' `neg_age' /* must not survive until the reshape below */

/* Men keep their rank in first_ind, women get theirs in second_ind */
gen second_ind = first_ind if kvinne == 1
replace first_ind = . if kvinne == 1

drop antall_i_data
bys familienr year: egen antall_i_data = count(kvinne) /* how many? */
bys familienr year: egen antall_first = count(first_ind) /* how many males? */
bys familienr year: egen antall_second = count(second_ind) /* how many females? */

/* Making the second first_ind a second_ind in two-male households */
replace second_ind = 1 /// making a second_ind
	if antall_i_data >= 2 /// if there are >= 2 hh members ...
	& antall_second == 0 /// and none of them are women ...
	& antall_first == antall_i_data /// and # of hhmembers == # of first_ind
	& first_ind == 2 // choosing the "second best" first_ind to be the second_ind
replace first_ind = . if second_ind == 1 // and removing his first_ind status


/* Making the first second_ind a first_ind in two-female hhs */
replace first_ind = 1 ///
	if antall_i_data >= 2 ///
	& antall_first == 0 ///
	& antall_second == antall_i_data ///
	& second_ind == 1
replace second_ind = . if first_ind == 1

/* Making the second second_ind the first second_ind in two-female hhs */
replace second_ind = 1 ///
	if antall_i_data >= 2 ///
	& antall_first == 0 ///
	& antall_second == antall_i_data ///
	& second_ind == 2

drop antall_first antall_second

drop antall_i_data
gen index = . /* Creating variable for reshaping */
replace index = 1 if first_ind == 1
replace index = 2 if second_ind == 1

drop if index == . /* Dropping all household members that are neither first primary ("man") nor first secondary ("woman") */

/* "Couple" here means that both main-member slots (index 1 and 2) are filled,
   i.e. there are "at least" two adults in the household */
bysort year familienr: egen n_main = count(index)
gen couple = n_main == 2
drop n_main

drop hushtyp adult nr_adults first_ind second_ind

/* Give every remaining member the household maximum of these variables */
foreach hhvar in children antpers_i_regstat_famnr {
	bysort familienr year: egen hh_max = max(`hhvar')
	replace `hhvar' = hh_max
	drop hh_max
}

/* In couples, member 2 takes over the household location of member 1 */
bysort familienr year (index): ///
	replace grkrets = grkrets[_n-1] if index == 2 & couple == 1

/* One row per family and year; person-level variables get suffix 1 or 2 */
reshape wide indid kvinne age wies employed wealth retired ///
	grk_bed time dist ptl toll grputd yrk_kode, ///
	i(familienr year) j(index)
/* NOTE: Be careful with the female variable - the only variation comes from
   same-sex couples:
	- kvinne1 == 1 iff both (all) are female
	- kvinne2 == 0 iff both (all) are male
*/

/* Travel-to-work variables that are still missing are set to 0 when the
   member exists (non-empty id) and either
	A: works and lives in the same grk, or
	B: is not employed.
   In all other cases they stay missing. */
foreach measure in time dist ptl toll {
	forvalues member = 1/2 {
		/* A: same area for home and work */
		replace `measure'`member' = 0 ///
			if `measure'`member' == . & indid`member' != "" ///
			& grkrets == grk_bed`member'
		/* B: not employed */
		replace `measure'`member' = 0 ///
			if `measure'`member' == . & indid`member' != "" ///
			& employed`member' == 0
	}
}

/* Place the main analysis variables directly after indid1 */
order indid2 couple children antpers_i_regstat_famnr secondhome grkrets ///
	grk_bed1 grk_bed2 ///
	time1 time2 dist1 dist2 toll1 toll2 ptl1 ptl2 ///
	age1 age2 kvinne1 kvinne2 employed1 employed2 ///
	retired1 retired2 wies1 wies2 wealth1 wealth2, a(indid1)

/* === STEP 5 ================================================================*/
/* Adding car data to the household data */

/* 5A: make hh-ind link (owner id + year -> family id) */
preserve
	import sas using "${newdata}househ_select", clear
	keep w17_1455_lnr familienr aargang
	destring aargang, gen(year)
	drop aargang
	rename w17_1455_lnr ownerid
	tempfile ind_link_to_hh
	save `ind_link_to_hh'
	
/* 5B: merge hh id to car data */
	use "${newdata}car_stock_end_of_year", clear
	compress
	* Only keeping households (dropping cars owned by firms)
	drop if ownertype == 0
	drop ownertype

	* Only keeping cars that were part of the fleet at the end of the year
	drop if car_dis == 1 // scrapped cars
	drop if car_dis == 2 // exported cars
	drop car_dis

	* Only keeping valid cars (no motorcycles, taxis, hearses, scooters, etc.)
	drop if car_valid == 0
	keep year ownerid carid car_acq kmperday days daysused freg_DMY fuel
	
	* Merging on family id
	merge m:1 ownerid year using `ind_link_to_hh', keep(match master)
	
	compress
	tab year _merge
	drop _merge ownerid
	drop if familienr == "" /* non-matches */
	order familienr carid, a(year)
	sort familienr year carid
	rename days daysowned
	drop if daysused == 0
	drop if daysowned == 0	
	
	/* Variable used for merging cars to households in prioritized order:
	   within family and year the car sorted last on freg_DMY gets number 1.
	   carid breaks ties between cars with the same freg_DMY so that the
	   numbering is reproducible from run to run. */
	bys year familienr (freg_DMY carid): gen fam_car_nr = _N - _n + 1 ///
		if familienr != "" & daysused != 0
	compress
	
	/* Creating `cars' number of datasets to merge on the household data */
	local cars 3 /* How many cars to keep data on */
	scalar kmperday_min = 0 /* Lowest kmperday accepted; smaller values count as missing */
	scalar kmperday_max = 200 /* Highest kmperday accepted; larger values count as missing */
	
	* Fixing invalid VKT observations
	* scalar() makes sure the scalars are used even if a variable with a
	* matching name or abbreviation exists in the data
	replace kmperday = . if kmperday < scalar(kmperday_min)
	replace kmperday = . if kmperday > scalar(kmperday_max)
	/* NOTE: kmperday defined as VKT split equally between days in which the car is registered, "daysused" */
	
	* Fixing days variables: cap at the number of days in the year.
	* Leap years are found with the calendar rule instead of a fixed list of
	* years, so years other than 2008, 2012 and 2016 are handled as well.
	tempvar days_in_year
	gen int `days_in_year' = 365 + ///
		(mod(year, 4) == 0 & (mod(year, 100) != 0 | mod(year, 400) == 0))
	foreach daysvar in daysused daysowned {
		replace `daysvar' = `days_in_year' ///
			if `daysvar' > `days_in_year' & `daysvar' != .
	}
	drop `days_in_year' /* keep the helper out of the saved file */
	tempfile cardata_w_familyid
	save `cardata_w_familyid'
restore

/* 5C: attach the household's cars one at a time; pass k adds the car
   numbered k within the family and year, and its variables get suffix k */
local carvars carid car_acq kmperday daysowned daysused freg_DMY fuel
forvalues k = 1/`cars' {
	display `k'
	gen fam_car_nr = `k'
	merge 1:1 familienr year fam_car_nr using `cardata_w_familyid', ///
		keep(match master)
	drop _merge fam_car_nr

	/* Build the list of suffixed names and rename all car variables at once */
	local suffixed
	foreach v of local carvars {
		local suffixed `suffixed' `v'`k'
	}
	rename (`carvars') (`suffixed')
}

compress

/* === Step 6: Merge dataset with public transit information ================ */
/* Each household member's home/work pair of areas is looked up in the public
   transit table, first as FRA = residence / TIL = workplace and then, for
   values still missing, in the opposite direction.

   The merges are m:1 (many households per area pair, one table row per
   pair). The earlier m:m merges paired rows by position within each key and
   would have hidden duplicate area pairs in the table; with m:1 Stata stops
   with an error if a pair is not unique. merge sorts the data itself, so
   no explicit sort is needed beforehand. */

destring grkrets, gen(grkrets_num)
destring grk_bed1, gen(grk_bed1_num)
destring grk_bed2, gen(grk_bed2_num)

local transitvars CD_TID TILBTID OMBORDTID VENTETID BOARDINGS TAKST

// First round: FROM residence TO workplace
preserve
	use "${additionaldata}kollektivtransport_grk" , clear
	rename FRA grkrets_num
	* For individual 1
	rename TIL grk_bed1_num
	tempfile public1
	save `public1'
	* For individual 2
	rename grk_bed1_num grk_bed2_num
	tempfile public2
	save `public2'
restore

forvalues i = 1/2 {
	merge m:1 grkrets_num grk_bed`i'_num using `public`i'', keep(match master)
	drop _merge
	/* Suffix with the member number so the next merge can add fresh copies */
	foreach v of local transitvars {
		rename `v' `v'_`i'
	}
}

// Second round: From workplace to residence
preserve
	use "${additionaldata}kollektivtransport_grk" , clear
	rename TIL grkrets_num
	* For individual 1
	rename FRA grk_bed1_num
	tempfile public1Rev
	save `public1Rev'
	* For individual 2
	rename grk_bed1_num grk_bed2_num
	tempfile public2Rev
	save `public2Rev'
restore

forvalues i = 1/2 {
	merge m:1 grkrets_num grk_bed`i'_num using `public`i'Rev', keep(match master)
	drop _merge
	/* Use the reverse direction only where the first round found nothing */
	foreach v of local transitvars {
		replace `v'_`i' = `v' if `v'_`i' == .
	}
	drop `transitvars'
}

*======== Make adjustments to public transit variables and generate new ones

// NB: If home and work are located in the same "grunnkrets", the public
// transit values are missing and are replaced by 0
local transitvars CD_TID TILBTID OMBORDTID VENTETID BOARDINGS TAKST
forvalues i = 1/2 {
	foreach v of local transitvars {
		replace `v'_`i' = 0 if grkrets_num == grk_bed`i'_num & `v'_`i' == .
	}
}

// Make public transit variables
capture drop PublicTransitTime_* PublicVSCarTime_*  PublicTransitYes_*  PublicDiffCarTime_*
forvalues i = 1/2 {
	* 1) Total travel time = TILBTID + OMBORDTID + VENTETID
	gen PublicTransitTime_`i' = TILBTID_`i' + OMBORDTID_`i' + VENTETID_`i'

	* 2) Ratio public / car (CD_TID); set to 1 when both are 0, i.e. when
	*    the individual works and lives in the same grunnkrets
	gen PublicVSCarTime_`i' = ///
		cond(PublicTransitTime_`i' == 0 & CD_TID_`i' == 0, ///
			1, PublicTransitTime_`i' / CD_TID_`i')

	* 3) Difference public - car
	gen PublicDiffCarTime_`i' = PublicTransitTime_`i' - CD_TID_`i'

	* 4) Dummy (0/1): 1 if a public transit travel time is available
	gen PublicTransitYes_`i' = PublicTransitTime_`i' != .
}

// Household means across the two members (rowmean ignores missing values)
capture drop PublicVSCarTime_fam_mean PublicDiffCarTime_fam_mean
egen PublicVSCarTime_fam_mean = rowmean(PublicVSCarTime_1 PublicVSCarTime_2)
egen PublicDiffCarTime_fam_mean = rowmean(PublicDiffCarTime_1 PublicDiffCarTime_2)

// Remove merge keys and member-level transit variables; only the household
// means are kept
capture drop grkrets_num grk_bed1_num grk_bed2_num
capture drop FRA_2014 TIL_2014
forvalues i = 1/2 {
	capture drop CD_TID_`i' TILBTID_`i' OMBORDTID_`i' VENTETID_`i' BOARDINGS_`i' TAKST_`i'
}
forvalues i = 1/2 {
	capture drop PublicTransitTime_`i' PublicVSCarTime_`i' PublicDiffCarTime_`i' PublicTransitYes_`i'
}

compress
save "${newdata}hh_3_cars_endofyear", replace

